This document demonstrates a simulation study designed to evaluate
the bias in estimating the targeting effect in psychological studies.
The analysis compares four different design strategies: the
Trait-Treatment Interaction design, the Selected Population Design,
the Matched-mismatched Design, and the Trait-only Design.
In the simulation, we generate data
using the gen_data() function, which models the
relationship between a trait, treatment condition, and the outcome
variable. We simulate different experimental designs through the
get_results() function, extracting the key coefficient for
each design.
We begin by setting up the necessary libraries and setting a seed for reproducibility.
library(dplyr)
library(ggplot2)
library(plotly)
library(gridExtra)
library(foreach)
library(doParallel)
library(readr)
set.seed(123)
Next, we define functions to generate the data, run different experimental designs, and analyze results through a grid search over different parameter combinations. Our main functions are:
gen_data(): Generates synthetic data with specified parameters.
get_results(): Implements and compares four different design strategies.
grid_search(): Performs a parameter sweep across different effect combinations.
plot_3d_bias(): Visualizes the bias in estimating the targeting effect.
# Define data generating function
gen_data <- function(n, b0, b1, b2, b3) {
  # Simulate one dataset of n participants.
  #
  # Args:
  #   n: sample size.
  #   b0, b1, b2: intercept, trait main effect, condition main effect.
  #   b3: trait-by-condition interaction (the targeting effect).
  # Returns: data.frame with columns trait, condition, y.
  trait <- rnorm(n)               # Trait measured without error for simplicity
  condition <- rbinom(n, 1, 0.5)  # Random 50/50 treatment assignment
  noise <- rnorm(n)
  # Outcome: linear in main effects plus the targeting interaction.
  outcome <- b0 + b1 * trait + b2 * condition + b3 * trait * condition + noise
  data.frame(trait = trait, condition = condition, y = outcome)
}
# Define function to run all designs
get_results <- function(data) {
  # Estimate the targeting (trait x condition) effect under four designs.
  #
  # Args:
  #   data: data.frame with columns trait, condition (0/1), y.
  # Returns: data.frame with one row per design and its b3 estimate.
  q25 <- quantile(data$trait, 0.25)
  q75 <- quantile(data$trait, 0.75)

  # Design 1 (Trait-Treatment Interaction): full regression; the
  # interaction coefficient is the benchmark estimate.
  fit <- lm(y ~ trait * condition, data = data)
  est_interaction <- coef(fit)["trait:condition"]

  # Design 2 (Selected Population Design): preselect a high-trait
  # audience, then compare conditions within it.
  high_trait <- data[data$trait > q75, ]
  fit <- lm(y ~ condition, data = high_trait)
  est_selected <- coef(fit)["condition"]

  # Design 3 (Matched-mismatched design): matched = high trait + treated
  # or low trait + control; mismatched = the reverse pairings.
  # Middle-trait rows stay NA and are dropped by lm().
  hi <- data$trait > q75
  lo <- data$trait <= q25
  treated <- data$condition == 1
  matched <- rep(NA_real_, nrow(data))
  matched[(hi & treated) | (lo & !treated)] <- 1
  matched[(hi & !treated) | (lo & treated)] <- 0
  data$matched <- matched
  fit <- lm(y ~ matched, data = data)
  est_matched <- coef(fit)["matched"]

  # Design 4 (Trait-only design): only the tailored condition is shown;
  # regress outcome on trait.
  tailored <- data[data$condition == 1, ]
  fit <- lm(y ~ trait, data = tailored)
  est_trait_only <- coef(fit)["trait"]

  # Collect all four estimates in one data frame.
  data.frame(
    design = c("Trait-Treatment Interaction", "Selected Population Design",
               "Matched-mismatched Design", "Trait-only Design"),
    b3 = c(est_interaction, est_selected, est_matched, est_trait_only)
  )
}
# Define grid search function to search through different combinations of main effects
grid_search <- function(grid, n, b3, n_designs) {
  # Sweep every row of `grid` (a combination of main effects b0, b1, b2):
  # simulate a dataset and collect the b3 estimate from each design.
  # Rows are processed in parallel via the registered doParallel backend.
  #
  # Args:
  #   grid: data.frame of main-effect combinations (columns b0, b1, b2).
  #   n: sample size per simulated dataset.
  #   b3: true targeting effect used to generate data.
  #   n_designs: number of designs returned by get_results().
  # Returns: data.frame with one row per grid cell, columns
  #   b3_design1..n_designs followed by the grid columns.
  results <- foreach(row = 1:nrow(grid), .combine = rbind, .packages = c("dplyr")) %dopar% {
    # Main-effect values for this grid cell.
    b0 <- grid[row, "b0"]
    b1 <- grid[row, "b1"]
    b2 <- grid[row, "b2"]
    # Simulate and estimate under all designs.
    sim <- gen_data(n, b0, b1, b2, b3)
    estimates <- get_results(sim)
    # One flat row: the n_designs estimates followed by the grid values.
    c(as.numeric(estimates$b3), as.numeric(grid[row, ]))
  }
  # Label columns: one per design estimate, then the swept parameters.
  results <- as.data.frame(results)
  colnames(results) <- c(paste0("b3_design", 1:n_designs), colnames(grid))
  return(results)
}
plot_3d_bias <- function(results, true_b3) {
  # Visualize the bias (estimated minus true targeting effect) of each
  # design as overlaid 3D surfaces over the grid of trait (b1) and
  # condition (b2) main effects.
  #
  # Args:
  #   results: data frame from grid_search() with columns
  #     b3_design1..b3_design4, b1, and b2.
  #   true_b3: the true interaction coefficient used to generate the data.
  # Returns: a plotly figure object.
  designs <- c("Trait-Treatment Interaction",
               "Selected Population Design",
               "Matched-mismatched Design",
               "Trait-only Design")
  colors <- c("black", "#27687B", "#64277B", "#7B3A27")
  n_designs <- length(designs)
  b1_unique <- unique(results$b1)
  b2_unique <- unique(results$b2)
  # One bias matrix (b1 x b2) per design.
  z_matrices <- vector("list", n_designs)
  for (i in seq_len(n_designs)) {
    z <- matrix(NA_real_, nrow = length(b1_unique), ncol = length(b2_unique))
    design_col <- paste0("b3_design", i)
    # Fill matrix with bias (estimated - true); assumes exactly one
    # grid row per (b1, b2) combination.
    for (j in seq_along(b1_unique)) {
      for (k in seq_along(b2_unique)) {
        idx <- results$b1 == b1_unique[j] & results$b2 == b2_unique[k]
        z[j, k] <- results[idx, design_col] - true_b3
      }
    }
    z_matrices[[i]] <- z
  }
  # Build the figure, adding one surface per design.
  p <- plot_ly()
  for (i in seq_len(n_designs)) {
    p <- add_surface(p,
                     x = b1_unique,
                     y = b2_unique,
                     z = z_matrices[[i]],
                     # Benchmark design fully opaque; alternatives translucent.
                     opacity = if (i == 1) 1 else 0.5,
                     # Constant-color surface per design.
                     colorscale = list(c(0, 1), c(colors[i], colors[i])),
                     showscale = FALSE,
                     showlegend = TRUE,
                     name = designs[i],
                     legendgroup = designs[i])
  }
  # Add layout details.
  p <- layout(p,
              scene = list(
                xaxis = list(title = list(text = "Trait main effect", font = list(size = 20))),
                yaxis = list(title = list(text = "Ad condition main effect", font = list(size = 20))),
                zaxis = list(title = list(text = "Bias in targeting effect", font = list(size = 20)))
              ),
              showlegend = FALSE,
              margin = list(r = 120), # Add right margin for legend
              title = NULL)
  return(p)
}
Finally, we set up global parameters (e.g., simulation sample size, true effect value, and grid search parameters) that drive the subsequent analysis.
# Set global parameters to run the simulation
n <- 10000000 # Asymptotic sample size (ensure this is big enough so that even subset-designs have asymptotic n)
b3 <- .5 # True targeting (trait x condition) effect used to generate the data
n_designs <- 4 # Number of design strategies compared by get_results()
grid <- expand.grid(
  #b0 = seq(-1, 1, by = 0.1),
  b0 = 0, # Intercept held fixed at zero
  b1 = c(-.25, 0, .25), # Trait main-effect values to sweep
  b2 = c(-.25, 0, .25) # Condition main-effect values to sweep
)
We implement parallel processing using the doParallel
package to efficiently run the grid search over combinations of main
effects. The grid_search() function iterates over the grid,
simulating data and calculating estimates for each design.
# Register parallel backend
cl <- makeCluster(10) # Do not use all your cores; this gets *very* RAM intensive
clusterExport(cl, c("gen_data", "get_results")) # Workers need the simulation helpers
registerDoParallel(cl) # Makes %dopar% inside grid_search() use this cluster
# Run the grid search
results <- grid_search(grid, n, b3, n_designs)
# Stop backend and release worker processes (and their memory)
stopCluster(cl)
# Print results
head(results)
## b3_design1 b3_design2 b3_design3 b3_design4 b0 b1 b2
## result.1 0.5000998 0.3856400 0.6359835 0.2501963 0 -0.25 -0.25
## result.2 0.5008665 0.3868806 0.6367567 0.4996505 0 0.00 -0.25
## result.3 0.4994163 0.3846635 0.6338892 0.7501552 0 0.25 -0.25
## result.4 0.4997393 0.6334542 0.6346612 0.2499306 0 -0.25 0.00
## result.5 0.5004741 0.6366525 0.6360419 0.5001736 0 0.00 0.00
## result.6 0.5008734 0.6356013 0.6362474 0.7505158 0 0.25 0.00
The plot_3d_bias() function constructs a 3D plot using
the plotly library to visualize the bias (difference between estimated
and true effects) over different values of trait and ad condition main
effects. This aids in understanding how design choices influence the
estimation bias.
# Create the 3D bias plot: surfaces show estimated minus true effect (b3)
fig <- plot_3d_bias(results, b3)
fig
Finally, we compute and report the mean absolute error (MAE) for each design. The MAE quantifies the average deviation of the estimated effects from the true value, providing a benchmark for the performance of each design.
# Calculate mean absolute error for all designs
# (columns 1:4 of `results` hold the per-design b3 estimates)
mae <- apply(results[, 1:4], 2, function(x) mean(abs(x - b3)))
# Print results
cat("Mean absolute error for all designs:\n")
## Mean absolute error for all designs:
mae
## b3_design1 b3_design2 b3_design3 b3_design4
## 0.0004408169 0.2119067653 0.1357041735 0.1667980365